# 安装依赖
# pip install python-docx pdfplumber unstructured pandoc

def _table_block(rows):
    """Render table rows as pipe-delimited lines wrapped in [TABLE] markers.

    Args:
        rows: Iterable of rows, each an iterable of cell values. A ``None``
            cell is rendered as an empty string; any other value goes
            through ``str()``.

    Returns:
        The ``[TABLE]\\n...\\n[/TABLE]`` block as a single string.
    """
    lines = [
        "| " + " | ".join("" if cell is None else str(cell) for cell in row) + " |"
        for row in rows
    ]
    body = "\n".join(lines)
    return f"[TABLE]\n{body}\n[/TABLE]"


def _read_text(path):
    """Read a text file as UTF-8, falling back to the platform default.

    UTF-8 (with an optional BOM, which is stripped) is tried first so the
    result does not depend on the machine's locale; files that are not valid
    UTF-8 are re-read with the default encoding, as the plain ``open(path)``
    call would have done.
    """
    try:
        with open(path, 'r', encoding='utf-8-sig') as f:
            return f.read()
    except UnicodeDecodeError:
        with open(path, 'r') as f:
            return f.read()


def extract_content(file_path):
    """Extract text and tables from a PDF, DOCX or TXT file.

    The file type is chosen from the file name's extension, compared
    case-insensitively.

    * ``.pdf``  - for every page, the page text (when non-empty) followed by
      each table found on that page. Requires ``pdfplumber``.
    * ``.docx`` - the text of every paragraph, then every table in the
      document (so tables are not interleaved with the paragraphs around
      them). Requires ``python-docx``.
    * ``.txt``  - the whole file as one piece of text.

    Tables are emitted as ``[TABLE]`` / ``[/TABLE]`` delimited blocks with one
    ``| cell | cell |`` line per row.

    Args:
        file_path: Path to the input file, as ``str`` or ``os.PathLike``.

    Returns:
        All extracted pieces joined with blank lines. An empty string is
        returned when the extension is not one of the supported types.
    """
    import os

    # Accept pathlib.Path as well as plain strings.
    path = os.fspath(file_path)
    lowered = path.lower()
    content = []

    if lowered.endswith('.pdf'):
        # Imported lazily so the dependency is only needed for PDF input.
        import pdfplumber
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                text = page.extract_text()
                if text:
                    content.append(text)
                # Empty PDF table cells may come back as None; _table_block
                # renders them as empty strings instead of failing in join().
                for table in page.extract_tables():
                    content.append(_table_block(table))

    elif lowered.endswith('.docx'):
        # Imported lazily so the dependency is only needed for DOCX input.
        from docx import Document
        doc = Document(path)
        for para in doc.paragraphs:
            content.append(para.text)
        for table in doc.tables:
            content.append(
                _table_block([cell.text for cell in row.cells] for row in table.rows)
            )

    elif lowered.endswith('.txt'):
        content.append(_read_text(path))

    return "\n\n".join(content)


if __name__ == '__main__':
    import sys

    # Sample document used when no path is given on the command line;
    # pass a file path as the first argument to process a different file.
    default_path = 'E:\\lihz\\10-Library\\08_webServer\\k_khlc2025\\test1.docx'
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    content = extract_content(file_path)
    print(content)